Welcome to the final exam.
If you haven't yet read the instructions, you can do so here.
Please run the following data set in the program(s) that you have written:
dna2.fasta
If you created your program(s) correctly, you will be able to answer the questions below.
In [1]:
    
import os.path
# Location of the FASTA input, relative to the working directory.
dna2_fasta_file_name = "dna2.fasta"
dna2_fasta_file_path = "./data"
# NOTE: despite its name this variable holds the full path of the file.
dna2_fasta_file_directory = os.path.join(dna2_fasta_file_path, dna2_fasta_file_name)
print("dna2_fasta_file_directory:%s" % dna2_fasta_file_directory)
# Open the file and read its whole content into the string "data"
# (read => str, readlines => list, readline => str).
# Opening and reading share one try block so a failed open is reported once
# instead of cascading into a NameError on "f"; the with-statement closes the
# handle even when the read fails.  "f" stays bound afterwards, so a later
# f.close() is a harmless no-op.
try:
    with open(dna2_fasta_file_directory, "r") as f:
        data = f.read()
    print("read data from %s successfully." % dna2_fasta_file_directory)
except (IOError, OSError) as e:
    print(e)
    
    
In [2]:
    
# Demo: str.count reports how many times a substring occurs in a string.
s = "asdfdfsas"
print(str.count(s, "s"))
    
    
In [3]:
    
# Count the records held in "data".  Each record is introduced by a header
# line beginning with ">", so only line-initial ">" characters are counted;
# data.count(">") would also count any ">" inside a header's text.
record_num = sum(1 for line in data.splitlines() if line.startswith(">"))
print("record_num:%s" % record_num)
# Close the file handle "f".  Kept best-effort: any failure (including "f"
# never having been bound) is printed rather than raised.
try:
    f.close()
    print("close %s file successfully." % dna2_fasta_file_directory)
except Exception as e:
    print(e)
    
    
In [4]:
    
# Open the file again and load it as a list of lines (line breaks are kept).
# The with-statement closes the handle as soon as the lines are read; without
# it the handle stayed open for the rest of the session.
try:
    with open(dna2_fasta_file_directory, "r") as f:
        data_string_list = f.readlines()
    print("open %s file and load data successfully." % dna2_fasta_file_directory)
except (IOError, OSError) as e:
    print(e)
    
    
In [5]:
    
# Split the lines of data_string_list into two parallel lists:
#   record_meta_list - the header lines (those starting with ">")
#   record_list      - for each header, its following lines joined into one
#                      string (line breaks are still present at this stage)
record_list = []
record_meta_list = []
cur_record_lines = None  # None until the first header line has been seen
for string in data_string_list:
    if string.startswith(">"):
        # A header closes the record collected so far and opens a new one.
        # Testing the first character (instead of counting ">") keeps headers
        # whose text contains further ">" characters from being read as
        # sequence data.
        record_meta_list.append(string)
        if cur_record_lines is not None:
            record_list.append("".join(cur_record_lines))
        cur_record_lines = []
    elif cur_record_lines is not None:
        cur_record_lines.append(string)
    # Lines that appear before the first header belong to no record and are
    # skipped (they used to trigger a NameError).

# Flush the final record.  Doing this after the loop also covers a file that
# ends with a header line, so both lists always have the same length.
if cur_record_lines is not None:
    record_list.append("".join(cur_record_lines))

print("record num.:%s" % len(record_list))
print("len(record_meta_list):%s" % len(record_meta_list))
    
    
In [6]:
    
# Demo: replacing a substring with the empty string deletes every occurrence.
s = "asadafsdaafs"
print(str.replace(s, "a", ""))
    
    
In [7]:
    
# Drop every "\n" from each record so that each element of record_list is one
# continuous string.
record_list = [sequence.replace("\n", "") for sequence in record_list]
    
In [8]:
    
def add(s1, s2):
    """Return ``s1 + s2``; the meaning of ``+`` is left to the operands."""
    total = s1 + s2
    return total
# Demo: combine two equally long lists element by element with add().
s1_list = list(range(1, 5))  # [1, 2, 3, 4]
s2_list = list(range(-1, -5, -1))  # [-1, -2, -3, -4]
print(s1_list)
print(s2_list)
print([add(left, right) for left, right in zip(s1_list, s2_list)])
    
    
In [9]:
    
# Length of every record, the largest of those lengths, and a list of
# (1-based record number, length) pairs.
record_len_list = [len(record_string) for record_string in record_list]
max_len_record_length = max(record_len_list)
# enumerate(..., 1) numbers however many records there are; the previous
# fixed range 1..18 only lined up when the file held exactly 18 records.
each_record_len_list = list(enumerate(record_len_list, 1))
print("len(record_len_list):%s" % len(record_len_list))
print("record_len_list:%s" % record_len_list)
print("max_len_record_length:%s" % max_len_record_length)
print("each_record_len_list:%s" % each_record_len_list)
    
    
In [10]:
    
# Smallest value in record_len_list, i.e. the length of the shortest record.
min_len_record_length = min(record_len_list)
print("min_len_record_length:%s" % (min_len_record_length,))
    
    
In [11]:
    
# Demo: searching for a substring that is absent.
# str.find answers -1, whereas str.index raises ValueError.
s = "asdfa"
print(s[3:].find("z"))
try:
    print(s[3:].index("z"))
except ValueError as e:
    # Caught so the demonstration shows the error without aborting the
    # statements that follow it.
    print(e)
    
    
    
In [12]:
    
# Demo: an endless loop that is left through "break"; prints 1, 2 and 3.
a = 0
while True:
    a = a + 1
    print(a)
    if a == 3:
        break
    
    
In [13]:
    
def get_sequence_string_accroding_2_frame_num(sequence_string, frame_num):
    """Return ``sequence_string`` as seen from reading frame ``frame_num``.

    Frame 2 drops the first character and frame 3 drops the first two.
    Frame 1 and every other value leave the sequence unchanged.
    """
    if frame_num == 2:
        skipped = 1
    elif frame_num == 3:
        skipped = 2
    else:
        skipped = 0
    return sequence_string[skipped:]
# Every record as seen from reading frame 2.
# frame_num used to be 6, a value get_sequence_string_accroding_2_frame_num
# does not handle (it returns the record unshifted), so the list named
# "frame_2" actually held the unshifted sequences.
record_list_according_2_frame_2 = [
    get_sequence_string_accroding_2_frame_num(sequence_string=record, frame_num=2)
    for record in record_list
]
    
In [14]:
    
def get_Ngram_list(sequence_string, gramN=3):
    """Cut ``sequence_string`` into consecutive, non-overlapping pieces.

    Args:
        sequence_string: the sequence to cut (anything sliceable with a length).
        gramN: length of every piece; defaults to 3 (trigrams).

    Returns:
        A list of ``len(sequence_string) // gramN`` pieces, each ``gramN``
        long.  Trailing characters that do not fill a whole piece are dropped.

    Raises:
        ValueError: if ``gramN`` is not positive.
    """
    if gramN <= 0:
        raise ValueError("gramN must be a positive integer")
    # Only the part of the sequence that divides evenly into pieces is used.
    usable_length = len(sequence_string) - len(sequence_string) % gramN
    # Step by gramN: stepping by 1 would yield overlapping windows taken from
    # the first third of the sequence instead of consecutive pieces.
    return [
        sequence_string[start:start + gramN]
        for start in range(0, usable_length, gramN)
    ]
# One trigram list per frame-2 record.
Ngram_2d_list = [
    get_Ngram_list(sequence_string=record, gramN=3)
    for record in record_list_according_2_frame_2
]
# Lengths of the frame-2 records themselves.
print([len(record) for record in record_list_according_2_frame_2])
    
    
In [15]:
    
def find_length_of_longgest_ORF(Ngram_sequence_list):
    """Return the length, in codons, of the longest ORF in a codon list.

    An ORF here begins at a start codon ("ATG") and ends at the first stop
    codon ("TAA", "TAG" or "TGA") that follows it.  Its length is the number
    of codons from the start codon up to, but not including, the stop codon.
    A start codon that is never followed by a stop codon does not form an ORF.

    Args:
        Ngram_sequence_list: sequence of upper-case codon strings in reading
            order.

    Returns:
        The largest ORF length found, or 0 when there is no complete ORF.
    """
    start_codon = "ATG"
    end_codon_list = ("TAA", "TAG", "TGA")
    cur_max_length = 0
    # Index of the earliest start codon seen since the last stop codon.  The
    # longest ORF closed by a given stop codon always begins there.
    cur_start_index = None
    for codon_index, codon in enumerate(Ngram_sequence_list):
        if codon == start_codon:
            if cur_start_index is None:
                cur_start_index = codon_index
        elif codon in end_codon_list and cur_start_index is not None:
            cur_max_length = max(cur_max_length, codon_index - cur_start_index)
            # later start codons open a fresh ORF
            cur_start_index = None
    return cur_max_length
    
In [ ]:
    
# Longest-ORF length of every codon list in Ngram_2d_list.
ORF_max_length_for_each_record_list = [
    find_length_of_longgest_ORF(codon_list) for codon_list in Ngram_2d_list
]
print(len(ORF_max_length_for_each_record_list))
    
In [30]:
    
# Show the built-in help text for the list type.
help(list)
    
    
In [16]:
    
def get_sequence_string_accroding_2_frame_num(sequence_string, frame_num):
    """Shift ``sequence_string`` to the start of reading frame ``frame_num``.

    Frame 3 removes the first two characters and frame 2 removes the first
    one.  Any other frame number, including 1, returns the sequence as given.
    """
    if frame_num == 3:
        return sequence_string[2:]
    if frame_num == 2:
        return sequence_string[1:]
    return sequence_string
# Every record as seen from reading frame 3.
record_list_according_2_frame3 = [
    get_sequence_string_accroding_2_frame_num(record, frame_num=3)
    for record in record_list
]
    
In [17]:
    
def get_Ngram_list(sequence_string, gramN=3):
    """Split ``sequence_string`` into back-to-back pieces of ``gramN`` items.

    Args:
        sequence_string: the sequence to split (anything sliceable with a
            length).
        gramN: size of each piece; defaults to 3 (trigrams).

    Returns:
        A list holding ``len(sequence_string) // gramN`` pieces.  Leftover
        characters at the end that cannot fill a piece are left out.

    Raises:
        ValueError: if ``gramN`` is not positive.
    """
    if gramN <= 0:
        raise ValueError("gramN must be a positive integer")
    piece_count = len(sequence_string) // gramN
    # Piece k starts at k * gramN.  Using k itself as the start offset would
    # return overlapping windows rather than consecutive pieces.
    return [
        sequence_string[piece * gramN:(piece + 1) * gramN]
        for piece in range(piece_count)
    ]
# One trigram list per frame-3 record.
Ngram_2d_list = [
    get_Ngram_list(sequence_string=record, gramN=3)
    for record in record_list_according_2_frame3
]
# Lengths of the frame-3 records themselves.
print([len(record) for record in record_list_according_2_frame3])
    
    
In [ ]:
    
def generate_len_six_string_list(string):
    """Return every substring of length 6 in ``string``, left to right.

    Consecutive substrings start one position apart, so they overlap.  A
    string shorter than 6 yields an empty list.
    """
    segment_length = 6
    # "+ 1" includes the final window, the one that ends on the last character.
    window_count = len(string) - segment_length + 1
    return [string[idx:idx + segment_length] for idx in range(window_count)]
    
In [ ]:
    
# Flatten a nested (2-dimensional) list into one flat list.
try:
    # The "compiler" package is deprecated since Python 2.6 and absent from
    # Python 3, hence the fallback below.
    from compiler.ast import flatten
except ImportError:
    def flatten(seq):
        """Return the items of ``seq`` with nested lists/tuples expanded."""
        flat = []
        for element in seq:
            if isinstance(element, (list, tuple)):
                flat.extend(flatten(element))
            else:
                flat.append(element)
        return flat

li = [[1, 2], [3], [4, 5, 2]]
print("li:%s" % li)
print("flatten(li):%s" % flatten(li))
    
In [ ]:
    
# Collect the length-6 substrings of every record in record_list, first as
# one list per record, then as a single flat list, then as the set of
# distinct substrings.
length6_2d_list = [generate_len_six_string_list(record) for record in record_list]
length6_list = flatten(length6_2d_list)
print("len(length6_list):%s" % len(length6_list))
length6_set = set(length6_list)
print("len(length6_set):%s" % len(length6_set))
    
In [ ]:
    
# Frequency table of the length-6 strings: length6_dict maps each distinct
# string in length6_list to the number of times it appears there.
# A single pass over the list gives the same table as rescanning the whole
# list once per distinct string, without the quadratic cost.
length6_dict = dict()
for cur_length6_string in length6_list:
    length6_dict[cur_length6_string] = length6_dict.get(cur_length6_string, 0) + 1
    
In [ ]:
    
# Highest count stored in the length-6 frequency table.
most_frequency_length6_value = max(length6_dict.values())
print("most_frequency_length6_value:%s" % (most_frequency_length6_value,))
    
In [ ]:
    
def generate_user_defined_length_string_list(string, segment_length):
    """Return every substring of ``segment_length`` characters in ``string``.

    Substrings are listed left to right and start one position apart, so
    they overlap.  A string shorter than ``segment_length`` yields an empty
    list.
    """
    # "+ 1" includes the final window, the one that ends on the last character.
    window_count = len(string) - segment_length + 1
    return [string[idx:idx + segment_length] for idx in range(window_count)]
    
In [ ]:
    
segment_length = 12
# Collect the length-12 substrings of every record in record_list.
length12_2d_list = [
    generate_user_defined_length_string_list(string, segment_length)
    for string in record_list
]
length12_list = flatten(length12_2d_list)
print("len(length12_list):%s" % len(length12_list))
length12_set = set(length12_list)
print("len(length12_set):%s" % len(length12_set))
# How often each distinct length-12 string appears in length12_list.
length12_frequency_dict = {}
for length12 in length12_list:
    length12_frequency_dict[length12] = length12_frequency_dict.get(length12, 0) + 1
length12_and_length_tuple_list = [(length12, len(length12)) for length12 in length12_set]
# Order by occurrence count, most frequent first.  Sorting on tup[1] could not
# rank anything because every string has the same length, which left the
# "most frequent" choice arbitrary.
sorted_length12_and_length_tuple_list = sorted(
    length12_and_length_tuple_list,
    key=lambda tup: length12_frequency_dict[tup[0]],
    reverse=True
)
most_frequency_length12_and_length_tuple = sorted_length12_and_length_tuple_list[0]
print("most_frequency_length12_and_length_tuple:%s" % str(most_frequency_length12_and_length_tuple))
print("most_frequency_length12_and_length_tuple[0:2]:%s" % str(most_frequency_length12_and_length_tuple[0:2]))
most_frequency_length12 = most_frequency_length12_and_length_tuple[0]
print("most_frequency_length12:%s" % most_frequency_length12)
# Occurrences per record, counted position by position so that overlapping
# copies are included, matching how the substrings were enumerated above
# (str.count only reports non-overlapping matches).
most_frequency_lenght12_count_in_each_record_list = [
    sum(
        1
        for start in range(len(record) - len(most_frequency_length12) + 1)
        if record.startswith(most_frequency_length12, start)
    )
    for record in record_list
]
print("most_frequency_lenght12_count_in_each_record_list:%s" % most_frequency_lenght12_count_in_each_record_list)
most_frequency_length12_count_sum = sum(most_frequency_lenght12_count_in_each_record_list)
print("most_frequency_length12_count_sum:%s" % most_frequency_length12_count_sum)
    
In [ ]:
    
# CATCGCC
# Count the motif in every record, position by position, so overlapping
# copies are included: the motif begins and ends with "C", so
# "CATCGCCATCGCC" holds two copies, of which str.count reports only one.
pattern_string = "CATCGCC"
pattern_count_list = [
    sum(
        1
        for start in range(len(record_string) - len(pattern_string) + 1)
        if record_string.startswith(pattern_string, start)
    )
    for record_string in record_list
]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))
    
In [ ]:
    
# GCGCGCA
# Occurrences of the motif in each record, then the total over all records.
# The motif ends in "A" and no shorter prefix of it does, so two copies can
# never overlap and str.count finds every one.
pattern_string = "GCGCGCA"
pattern_count_list = [record_string.count(pattern_string) for record_string in record_list]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))
    
In [ ]:
    
# TGCGCGC
# Occurrences of the motif in each record, then the total over all records.
# "T" appears only as the motif's first character, so two copies can never
# overlap and str.count finds every one.
pattern_string = "TGCGCGC"
pattern_count_list = [record_string.count(pattern_string) for record_string in record_list]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))
    
In [ ]:
    
# CGCGCCG
# Count the motif in every record, position by position, so overlapping
# copies are included: the motif begins and ends with "CG", so
# "CGCGCCGCGCCG" holds two copies, of which str.count reports only one.
pattern_string = "CGCGCCG"
pattern_count_list = [
    sum(
        1
        for start in range(len(record_string) - len(pattern_string) + 1)
        if record_string.startswith(pattern_string, start)
    )
    for record_string in record_list
]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))